library(MASS)
library(tidyverse)
library(readr)
library(psych)
library(ggplot2)
library(dplyr)
library(corrplot)
library(RColorBrewer)
library(gridExtra)
library(caret)
library(pROC)
library(car)
# library(MXM)
# library(parallel)
# library(doParallel)TFG Codigo
Librerias
Base de datos
# Point the session at the folder that contains the data file.
# NOTE(review): this absolute path only exists on the author's machine; an
# RStudio project or a path relative to the script would make this portable.
setwd("C:\\Users\\diego\\OneDrive\\Escritorio\\UCM\\Cuarto\\Segundo Cuatri")
# Read application_data.csv into `datos`.
# NOTE(review): with the default na.strings = "NA", blank cells in text columns
# are read as "" and are therefore not counted by is.na(). Pass
# na.strings = c("NA", "") if blanks should be treated as missing -- confirm.
datos <- read.csv(file = "application_data.csv")
Depuracion de datos
Primero vemos cuántas observaciones faltantes hay por columna.
# Count the NA values in each column and list the columns in ascending order
# of that count.
data.frame(sort(colSums(is.na(datos))))
sort.colSums.is.na.datos...
SK_ID_CURR 0
TARGET 0
NAME_CONTRACT_TYPE 0
CODE_GENDER 0
FLAG_OWN_CAR 0
FLAG_OWN_REALTY 0
CNT_CHILDREN 0
AMT_INCOME_TOTAL 0
AMT_CREDIT 0
NAME_TYPE_SUITE 0
NAME_INCOME_TYPE 0
NAME_EDUCATION_TYPE 0
NAME_FAMILY_STATUS 0
NAME_HOUSING_TYPE 0
REGION_POPULATION_RELATIVE 0
DAYS_BIRTH 0
DAYS_EMPLOYED 0
DAYS_REGISTRATION 0
DAYS_ID_PUBLISH 0
FLAG_MOBIL 0
FLAG_EMP_PHONE 0
FLAG_WORK_PHONE 0
FLAG_CONT_MOBILE 0
FLAG_PHONE 0
FLAG_EMAIL 0
OCCUPATION_TYPE 0
REGION_RATING_CLIENT 0
REGION_RATING_CLIENT_W_CITY 0
WEEKDAY_APPR_PROCESS_START 0
HOUR_APPR_PROCESS_START 0
REG_REGION_NOT_LIVE_REGION 0
REG_REGION_NOT_WORK_REGION 0
LIVE_REGION_NOT_WORK_REGION 0
REG_CITY_NOT_LIVE_CITY 0
REG_CITY_NOT_WORK_CITY 0
LIVE_CITY_NOT_WORK_CITY 0
ORGANIZATION_TYPE 0
FONDKAPREMONT_MODE 0
HOUSETYPE_MODE 0
WALLSMATERIAL_MODE 0
EMERGENCYSTATE_MODE 0
FLAG_DOCUMENT_2 0
FLAG_DOCUMENT_3 0
FLAG_DOCUMENT_4 0
FLAG_DOCUMENT_5 0
FLAG_DOCUMENT_6 0
FLAG_DOCUMENT_7 0
FLAG_DOCUMENT_8 0
FLAG_DOCUMENT_9 0
FLAG_DOCUMENT_10 0
FLAG_DOCUMENT_11 0
FLAG_DOCUMENT_12 0
FLAG_DOCUMENT_13 0
FLAG_DOCUMENT_14 0
FLAG_DOCUMENT_15 0
FLAG_DOCUMENT_16 0
FLAG_DOCUMENT_17 0
FLAG_DOCUMENT_18 0
FLAG_DOCUMENT_19 0
FLAG_DOCUMENT_20 0
FLAG_DOCUMENT_21 0
DAYS_LAST_PHONE_CHANGE 1
CNT_FAM_MEMBERS 2
AMT_ANNUITY 12
AMT_GOODS_PRICE 278
EXT_SOURCE_2 660
OBS_30_CNT_SOCIAL_CIRCLE 1021
DEF_30_CNT_SOCIAL_CIRCLE 1021
OBS_60_CNT_SOCIAL_CIRCLE 1021
DEF_60_CNT_SOCIAL_CIRCLE 1021
AMT_REQ_CREDIT_BUREAU_HOUR 41519
AMT_REQ_CREDIT_BUREAU_DAY 41519
AMT_REQ_CREDIT_BUREAU_WEEK 41519
AMT_REQ_CREDIT_BUREAU_MON 41519
AMT_REQ_CREDIT_BUREAU_QRT 41519
AMT_REQ_CREDIT_BUREAU_YEAR 41519
EXT_SOURCE_3 60965
TOTALAREA_MODE 148431
YEARS_BEGINEXPLUATATION_AVG 150007
YEARS_BEGINEXPLUATATION_MODE 150007
YEARS_BEGINEXPLUATATION_MEDI 150007
FLOORSMAX_AVG 153020
FLOORSMAX_MODE 153020
FLOORSMAX_MEDI 153020
LIVINGAREA_AVG 154350
LIVINGAREA_MODE 154350
LIVINGAREA_MEDI 154350
ENTRANCES_AVG 154828
ENTRANCES_MODE 154828
ENTRANCES_MEDI 154828
APARTMENTS_AVG 156061
APARTMENTS_MODE 156061
APARTMENTS_MEDI 156061
ELEVATORS_AVG 163891
ELEVATORS_MODE 163891
ELEVATORS_MEDI 163891
NONLIVINGAREA_AVG 169682
NONLIVINGAREA_MODE 169682
NONLIVINGAREA_MEDI 169682
EXT_SOURCE_1 173378
BASEMENTAREA_AVG 179943
BASEMENTAREA_MODE 179943
BASEMENTAREA_MEDI 179943
LANDAREA_AVG 182590
LANDAREA_MODE 182590
LANDAREA_MEDI 182590
OWN_CAR_AGE 202929
YEARS_BUILD_AVG 204488
YEARS_BUILD_MODE 204488
YEARS_BUILD_MEDI 204488
FLOORSMIN_AVG 208642
FLOORSMIN_MODE 208642
FLOORSMIN_MEDI 208642
LIVINGAPARTMENTS_AVG 210199
LIVINGAPARTMENTS_MODE 210199
LIVINGAPARTMENTS_MEDI 210199
NONLIVINGAPARTMENTS_AVG 213514
NONLIVINGAPARTMENTS_MODE 213514
NONLIVINGAPARTMENTS_MEDI 213514
COMMONAREA_AVG 214865
COMMONAREA_MODE 214865
COMMONAREA_MEDI 214865
Ahora tenemos que decidir qué hacemos con esas observaciones. Hay dos opciones: eliminar las observaciones con datos faltantes o sustituir los valores faltantes aplicando reglas de imputación.
# Percentage of NA values in every column, reshaped to one row per column
# (Column_Name, Null_Values_Percentage).
null_datos_df <- datos |>
  summarise(across(everything(), \(x) sum(is.na(x)) * 100 / n())) |>
  pivot_longer(
    cols = everything(),
    names_to = "Column_Name",
    values_to = "Null_Values_Percentage"
  )

# Dot plot of the missing-value percentage per column, ordered from the
# highest to the lowest percentage.
ggplot(
  null_datos_df,
  aes(
    x = reorder(Column_Name, -Null_Values_Percentage),
    y = Null_Values_Percentage
  )
) +
  geom_point(color = "blue") +
  # Dashed reference line at the 40 % threshold
  geom_hline(yintercept = 40, linetype = "dashed", color = "red") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 7)) +
  labs(
    title = "Percentage of Missing Values in Application Data",
    x = "Columns",
    y = "Null Values Percentage"
  )
Variables con mas de un 40 % de datos faltantes
# Columns in which at least 40 % of the values are missing (NA).
# 40 % is used as the cut-off because replacing that large a share of a
# column with its mean or median is not considered a good idea.
nullcol_40_application <- filter(null_datos_df, Null_Values_Percentage >= 40)

# Show the result
print(nullcol_40_application)
# A tibble: 45 × 2
Column_Name Null_Values_Percentage
<chr> <dbl>
1 OWN_CAR_AGE 66.0
2 EXT_SOURCE_1 56.4
3 APARTMENTS_AVG 50.7
4 BASEMENTAREA_AVG 58.5
5 YEARS_BEGINEXPLUATATION_AVG 48.8
6 YEARS_BUILD_AVG 66.5
7 COMMONAREA_AVG 69.9
8 ELEVATORS_AVG 53.3
9 ENTRANCES_AVG 50.3
10 FLOORSMAX_AVG 49.8
# ℹ 35 more rows
Datos faltantes
Cuántos datos faltantes tenemos por columna.
# Names of the columns that are treated as categorical.
# NOTE(review): REG_REGION_NOT_LIVE_REGION is not listed although its five
# sibling REG_*/LIVE_* columns are -- confirm whether that is intentional.
categorical_columns <- c(
  "NAME_CONTRACT_TYPE", "CODE_GENDER", "NAME_TYPE_SUITE", "NAME_INCOME_TYPE",
  "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE",
  "OCCUPATION_TYPE", "WEEKDAY_APPR_PROCESS_START", "ORGANIZATION_TYPE",
  "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "LIVE_CITY_NOT_WORK_CITY",
  "REG_CITY_NOT_LIVE_CITY", "REG_CITY_NOT_WORK_CITY",
  "REG_REGION_NOT_WORK_REGION", "LIVE_REGION_NOT_WORK_REGION",
  "REGION_RATING_CLIENT", "REGION_RATING_CLIENT_W_CITY"
)

# Contact-information flag columns.
contact_col <- paste0(
  "FLAG_",
  c("MOBIL", "EMP_PHONE", "WORK_PHONE", "CONT_MOBILE", "PHONE", "EMAIL")
)

# Document flag columns FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21.
col_Doc <- paste0("FLAG_DOCUMENT_", 2:21)

# External-source columns EXT_SOURCE_1 ... EXT_SOURCE_3.
ext <- paste0("EXT_SOURCE_", 1:3)
# Count the NA values in each column again, listed in ascending order of
# that count (same call as at the start of the data-cleaning section).
data.frame(sort(colSums(is.na(datos))))
sort.colSums.is.na.datos...
SK_ID_CURR 0
TARGET 0
NAME_CONTRACT_TYPE 0
CODE_GENDER 0
FLAG_OWN_CAR 0
FLAG_OWN_REALTY 0
CNT_CHILDREN 0
AMT_INCOME_TOTAL 0
AMT_CREDIT 0
NAME_TYPE_SUITE 0
NAME_INCOME_TYPE 0
NAME_EDUCATION_TYPE 0
NAME_FAMILY_STATUS 0
NAME_HOUSING_TYPE 0
REGION_POPULATION_RELATIVE 0
DAYS_BIRTH 0
DAYS_EMPLOYED 0
DAYS_REGISTRATION 0
DAYS_ID_PUBLISH 0
FLAG_MOBIL 0
FLAG_EMP_PHONE 0
FLAG_WORK_PHONE 0
FLAG_CONT_MOBILE 0
FLAG_PHONE 0
FLAG_EMAIL 0
OCCUPATION_TYPE 0
REGION_RATING_CLIENT 0
REGION_RATING_CLIENT_W_CITY 0
WEEKDAY_APPR_PROCESS_START 0
HOUR_APPR_PROCESS_START 0
REG_REGION_NOT_LIVE_REGION 0
REG_REGION_NOT_WORK_REGION 0
LIVE_REGION_NOT_WORK_REGION 0
REG_CITY_NOT_LIVE_CITY 0
REG_CITY_NOT_WORK_CITY 0
LIVE_CITY_NOT_WORK_CITY 0
ORGANIZATION_TYPE 0
FONDKAPREMONT_MODE 0
HOUSETYPE_MODE 0
WALLSMATERIAL_MODE 0
EMERGENCYSTATE_MODE 0
FLAG_DOCUMENT_2 0
FLAG_DOCUMENT_3 0
FLAG_DOCUMENT_4 0
FLAG_DOCUMENT_5 0
FLAG_DOCUMENT_6 0
FLAG_DOCUMENT_7 0
FLAG_DOCUMENT_8 0
FLAG_DOCUMENT_9 0
FLAG_DOCUMENT_10 0
FLAG_DOCUMENT_11 0
FLAG_DOCUMENT_12 0
FLAG_DOCUMENT_13 0
FLAG_DOCUMENT_14 0
FLAG_DOCUMENT_15 0
FLAG_DOCUMENT_16 0
FLAG_DOCUMENT_17 0
FLAG_DOCUMENT_18 0
FLAG_DOCUMENT_19 0
FLAG_DOCUMENT_20 0
FLAG_DOCUMENT_21 0
DAYS_LAST_PHONE_CHANGE 1
CNT_FAM_MEMBERS 2
AMT_ANNUITY 12
AMT_GOODS_PRICE 278
EXT_SOURCE_2 660
OBS_30_CNT_SOCIAL_CIRCLE 1021
DEF_30_CNT_SOCIAL_CIRCLE 1021
OBS_60_CNT_SOCIAL_CIRCLE 1021
DEF_60_CNT_SOCIAL_CIRCLE 1021
AMT_REQ_CREDIT_BUREAU_HOUR 41519
AMT_REQ_CREDIT_BUREAU_DAY 41519
AMT_REQ_CREDIT_BUREAU_WEEK 41519
AMT_REQ_CREDIT_BUREAU_MON 41519
AMT_REQ_CREDIT_BUREAU_QRT 41519
AMT_REQ_CREDIT_BUREAU_YEAR 41519
EXT_SOURCE_3 60965
TOTALAREA_MODE 148431
YEARS_BEGINEXPLUATATION_AVG 150007
YEARS_BEGINEXPLUATATION_MODE 150007
YEARS_BEGINEXPLUATATION_MEDI 150007
FLOORSMAX_AVG 153020
FLOORSMAX_MODE 153020
FLOORSMAX_MEDI 153020
LIVINGAREA_AVG 154350
LIVINGAREA_MODE 154350
LIVINGAREA_MEDI 154350
ENTRANCES_AVG 154828
ENTRANCES_MODE 154828
ENTRANCES_MEDI 154828
APARTMENTS_AVG 156061
APARTMENTS_MODE 156061
APARTMENTS_MEDI 156061
ELEVATORS_AVG 163891
ELEVATORS_MODE 163891
ELEVATORS_MEDI 163891
NONLIVINGAREA_AVG 169682
NONLIVINGAREA_MODE 169682
NONLIVINGAREA_MEDI 169682
EXT_SOURCE_1 173378
BASEMENTAREA_AVG 179943
BASEMENTAREA_MODE 179943
BASEMENTAREA_MEDI 179943
LANDAREA_AVG 182590
LANDAREA_MODE 182590
LANDAREA_MEDI 182590
OWN_CAR_AGE 202929
YEARS_BUILD_AVG 204488
YEARS_BUILD_MODE 204488
YEARS_BUILD_MEDI 204488
FLOORSMIN_AVG 208642
FLOORSMIN_MODE 208642
FLOORSMIN_MEDI 208642
LIVINGAPARTMENTS_AVG 210199
LIVINGAPARTMENTS_MODE 210199
LIVINGAPARTMENTS_MEDI 210199
NONLIVINGAPARTMENTS_AVG 213514
NONLIVINGAPARTMENTS_MODE 213514
NONLIVINGAPARTMENTS_MEDI 213514
COMMONAREA_AVG 214865
COMMONAREA_MODE 214865
COMMONAREA_MEDI 214865
# Convert the categorical columns to factors
datos[categorical_columns] <- lapply(datos[categorical_columns], as.factor)
Factorizamos las variables contacto y otras que sean necesarias
# Convert the contact-flag and document-flag columns to factors as well
# (single pass over both groups of columns).
datos <- datos |>
  mutate(across(all_of(c(contact_col, col_Doc)), as.factor))
variables categoricas
con pocos datos faltantes (moda)
# Impute missing values with the mode (most frequent value).
#
# Args:
#   x: a vector. Only factor and character vectors are imputed; any other
#      type is returned untouched.
#
# Returns:
#   `x` with every NA replaced by the most frequent non-missing value. When
#   several values tie, the first one in table() order is used. `x` is
#   returned unchanged when it has no NA, or when it has no observed value
#   at all (there is no mode to impute with).
#
# Only NA counts as missing here; empty strings are left as they are.
imputar_moda <- function(x) {
  # Scalar condition, so use the short-circuit operator
  if (!(is.factor(x) || is.character(x))) {
    return(x)
  }

  faltantes <- is.na(x)
  # Nothing to fill, or nothing to compute a mode from
  if (!any(faltantes) || all(faltantes)) {
    return(x)
  }

  frecuencias <- table(x)
  moda <- names(frecuencias)[which.max(frecuencias)]
  x[faltantes] <- moda
  x
}
#categorical_columns <- c(categorical_columns,"AMT_INCOME_RANGE")
# Apply the mode imputation to every categorical column
datos[categorical_columns] <- lapply(datos[categorical_columns], imputar_moda)
variables numericas
Para las variables numéricas que tienen observaciones faltantes, sustituiremos los valores faltantes por la media.
# Describe the distribution of every numeric column of a data frame.
#
# For each numeric column this prints a banner and its summary(), draws a
# histogram, runs a one-sample Kolmogorov-Smirnov test against a normal
# distribution with the column's own mean and standard deviation, and prints
# a verdict line.
#
# Args:
#   datos: a data frame (or tibble).
#   alpha: significance level used for the verdict (default 0.05).
#
# Returns:
#   NULL, invisibly. The function is called for its printed output and plots.
#
# NOTE(review): the normal parameters are estimated from the same data being
# tested, so the KS p-value is only approximate (see ?ks.test); a Lilliefors
# correction or a Q-Q plot may be more appropriate -- confirm.
distribucion_variables_numericas <- function(datos, alpha = 0.05) {
  columnas <- names(datos)[vapply(datos, is.numeric, logical(1))]
  # Smallest p-value reported literally; anything below is shown as "< eps"
  p_minimo <- .Machine$double.eps

  for (col in columnas) {
    valores <- datos[[col]]
    observados <- valores[!is.na(valores)]

    cat("\n-------------------------------------------------\n")
    cat("Distribución de la variable:", col, "\n")
    cat("-------------------------------------------------\n")
    print(summary(valores))

    # hist() and ks.test() both fail on a column without observed values
    if (length(observados) == 0L) {
      cat("La variable", col, "no tiene valores observados; se omiten el histograma y el test.\n")
      next
    }

    hist(
      valores,
      main = paste("Histograma de", col),
      col = "skyblue",
      border = "white",
      xlab = col
    )

    # A normal reference distribution needs a positive, finite spread
    desviacion <- sd(observados)
    if (is.na(desviacion) || desviacion == 0) {
      cat("La variable", col, "no tiene variabilidad suficiente; se omite el test de normalidad.\n")
      next
    }

    ks_test <- ks.test(datos[[col]], "pnorm", mean(observados), desviacion)
    cat("\nTest de Kolmogorov-Smirnov para la normalidad:\n")
    print(ks_test)

    p_valor <- ks_test$p.value
    # Avoid printing the impossible "p < 0" when the p-value underflows
    p_texto <- if (p_valor < p_minimo) {
      paste("<", format(p_minimo, digits = 3))
    } else {
      paste("=", format(p_valor, digits = 3))
    }

    if (p_valor < alpha) {
      cat("❌ La variable", col, "NO sigue una distribución normal (p", p_texto, ")\n")
    } else {
      cat("✅ La variable", col, "SIGUE una distribución normal (p", p_texto, ")\n")
    }
  }

  invisible(NULL)
}
# Call the function on `datos`
distribucion_variables_numericas(datos)
-------------------------------------------------
Distribución de la variable: SK_ID_CURR
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
100002 189146 278202 278181 367143 456255
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.057265, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable SK_ID_CURR NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: TARGET
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.08073 0.00000 1.00000
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.53579, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable TARGET NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: CNT_CHILDREN
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.4171 1.0000 19.0000
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.41858, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable CNT_CHILDREN NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: AMT_INCOME_TOTAL
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
25650 112500 147150 168798 202500 117000000
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.30171, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable AMT_INCOME_TOTAL NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: AMT_CREDIT
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
45000 270000 513531 599026 808650 4050000
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.11015, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable AMT_CREDIT NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: AMT_ANNUITY
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
1616 16524 24903 27109 34596 258026 12
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.0789, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable AMT_ANNUITY NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: AMT_GOODS_PRICE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
40500 238500 450000 538396 679500 4050000 278
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.14269, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable AMT_GOODS_PRICE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: REGION_POPULATION_RELATIVE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00029 0.01001 0.01885 0.02087 0.02866 0.07251
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.11345, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable REGION_POPULATION_RELATIVE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: DAYS_BIRTH
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
-25229 -19682 -15750 -16037 -12413 -7489
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.048582, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable DAYS_BIRTH NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: DAYS_EMPLOYED
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
-17912 -2760 -1213 63815 -289 365243
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.49419, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable DAYS_EMPLOYED NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: DAYS_REGISTRATION
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
-24672 -7480 -4504 -4986 -2010 0
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.078483, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable DAYS_REGISTRATION NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: DAYS_ID_PUBLISH
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
-7197 -4299 -3254 -2994 -1720 0
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.12221, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable DAYS_ID_PUBLISH NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: OWN_CAR_AGE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 5.00 9.00 12.06 15.00 91.00 202929
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.16271, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable OWN_CAR_AGE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: CNT_FAM_MEMBERS
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
1.000 2.000 2.000 2.153 3.000 20.000 2
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.30217, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable CNT_FAM_MEMBERS NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: HOUR_APPR_PROCESS_START
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 10.00 12.00 12.06 14.00 23.00
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.08234, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable HOUR_APPR_PROCESS_START NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: REG_REGION_NOT_LIVE_REGION
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.01514 0.00000 1.00000
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.5342, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable REG_REGION_NOT_LIVE_REGION NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: EXT_SOURCE_1
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.01 0.33 0.51 0.50 0.68 0.96 173378
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.044677, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable EXT_SOURCE_1 NO sigue una distribución normal (p < 5.58411e-233 )
-------------------------------------------------
Distribución de la variable: EXT_SOURCE_2
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0000 0.3925 0.5660 0.5144 0.6636 0.8550 660
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.10691, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable EXT_SOURCE_2 NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: EXT_SOURCE_3
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.37 0.54 0.51 0.67 0.90 60965
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.061755, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable EXT_SOURCE_3 NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: APARTMENTS_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.06 0.09 0.12 0.15 1.00 156061
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.1668, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable APARTMENTS_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: BASEMENTAREA_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.04 0.08 0.09 0.11 1.00 179943
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.14167, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable BASEMENTAREA_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: YEARS_BEGINEXPLUATATION_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.98 0.98 0.98 0.99 1.00 150007
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.39064, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable YEARS_BEGINEXPLUATATION_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: YEARS_BUILD_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.69 0.76 0.75 0.82 1.00 204488
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.051642, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable YEARS_BUILD_AVG NO sigue una distribución normal (p < 4.560853e-239 )
-------------------------------------------------
Distribución de la variable: COMMONAREA_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.01 0.02 0.04 0.05 1.00 214865
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.27866, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable COMMONAREA_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: ELEVATORS_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.08 0.12 1.00 163891
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.3181, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable ELEVATORS_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: ENTRANCES_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.07 0.14 0.15 0.21 1.00 154828
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.19338, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable ENTRANCES_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: FLOORSMAX_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.17 0.17 0.23 0.33 1.00 153020
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.27317, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable FLOORSMAX_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: FLOORSMIN_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.08 0.21 0.23 0.38 1.00 208642
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.22705, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable FLOORSMIN_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: LANDAREA_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.02 0.05 0.07 0.09 1.00 182590
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.20694, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable LANDAREA_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: LIVINGAPARTMENTS_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.05 0.08 0.10 0.12 1.00 210199
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.17467, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable LIVINGAPARTMENTS_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: LIVINGAREA_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.05 0.07 0.11 0.13 1.00 154350
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.18232, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable LIVINGAREA_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: NONLIVINGAPARTMENTS_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.01 0.00 1.00 213514
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.42679, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable NONLIVINGAPARTMENTS_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: NONLIVINGAREA_AVG
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.03 0.03 1.00 169682
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.34168, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable NONLIVINGAREA_AVG NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: APARTMENTS_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.05 0.08 0.11 0.14 1.00 156061
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.17123, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable APARTMENTS_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: BASEMENTAREA_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.04 0.07 0.09 0.11 1.00 179943
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.14955, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable BASEMENTAREA_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: YEARS_BEGINEXPLUATATION_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.98 0.98 0.98 0.99 1.00 150007
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.39761, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable YEARS_BEGINEXPLUATATION_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: YEARS_BUILD_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.70 0.76 0.76 0.82 1.00 204488
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.054756, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable YEARS_BUILD_MODE NO sigue una distribución normal (p < 1.021391e-268 )
-------------------------------------------------
Distribución de la variable: COMMONAREA_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.01 0.02 0.04 0.05 1.00 214865
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.28379, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable COMMONAREA_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: ELEVATORS_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.07 0.12 1.00 163891
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.33652, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable ELEVATORS_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: ENTRANCES_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.07 0.14 0.15 0.21 1.00 154828
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.204, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable ENTRANCES_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: FLOORSMAX_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.17 0.17 0.22 0.33 1.00 153020
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.28906, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable FLOORSMAX_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: FLOORSMIN_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.08 0.21 0.23 0.38 1.00 208642
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.23649, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable FLOORSMIN_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: LANDAREA_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.02 0.05 0.06 0.08 1.00 182590
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.21343, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable LANDAREA_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: LIVINGAPARTMENTS_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.05 0.08 0.11 0.13 1.00 210199
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.17894, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable LIVINGAPARTMENTS_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: LIVINGAREA_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.04 0.07 0.11 0.13 1.00 154350
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.19075, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable LIVINGAREA_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: NONLIVINGAPARTMENTS_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.01 0.00 1.00 213514
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.43073, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable NONLIVINGAPARTMENTS_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: NONLIVINGAREA_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.03 0.02 1.00 169682
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.35025, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable NONLIVINGAREA_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: APARTMENTS_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.06 0.09 0.12 0.15 1.00 156061
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.16968, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable APARTMENTS_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: BASEMENTAREA_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.04 0.08 0.09 0.11 1.00 179943
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.14225, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable BASEMENTAREA_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: YEARS_BEGINEXPLUATATION_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.98 0.98 0.98 0.99 1.00 150007
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.39156, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable YEARS_BEGINEXPLUATATION_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: YEARS_BUILD_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.69 0.76 0.76 0.83 1.00 204488
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.051814, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable YEARS_BUILD_MEDI NO sigue una distribución normal (p < 1.165368e-240 )
-------------------------------------------------
Distribución de la variable: COMMONAREA_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.01 0.02 0.04 0.05 1.00 214865
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.27905, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable COMMONAREA_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: ELEVATORS_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.08 0.12 1.00 163891
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.32521, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable ELEVATORS_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: ENTRANCES_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.07 0.14 0.15 0.21 1.00 154828
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.19915, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable ENTRANCES_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: FLOORSMAX_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.17 0.17 0.23 0.33 1.00 153020
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.28113, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable FLOORSMAX_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: FLOORSMIN_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.08 0.21 0.23 0.38 1.00 208642
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.23289, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable FLOORSMIN_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: LANDAREA_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.02 0.05 0.07 0.09 1.00 182590
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.20683, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable LANDAREA_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: LIVINGAPARTMENTS_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.05 0.08 0.10 0.12 1.00 210199
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.17714, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable LIVINGAPARTMENTS_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: LIVINGAREA_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.05 0.07 0.11 0.13 1.00 154350
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.18396, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable LIVINGAREA_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: NONLIVINGAPARTMENTS_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.01 0.00 1.00 213514
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.42761, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable NONLIVINGAPARTMENTS_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: NONLIVINGAREA_MEDI
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.03 0.03 1.00 169682
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.34369, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable NONLIVINGAREA_MEDI NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: TOTALAREA_MODE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.04 0.07 0.10 0.13 1.00 148431
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.18429, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable TOTALAREA_MODE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: OBS_30_CNT_SOCIAL_CIRCLE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.000 0.000 0.000 1.422 2.000 348.000 1021
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.27681, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable OBS_30_CNT_SOCIAL_CIRCLE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: DEF_30_CNT_SOCIAL_CIRCLE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0000 0.0000 0.0000 0.1434 0.0000 34.0000 1021
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.51118, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable DEF_30_CNT_SOCIAL_CIRCLE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: OBS_60_CNT_SOCIAL_CIRCLE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.000 0.000 0.000 1.405 2.000 344.000 1021
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.27743, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable OBS_60_CNT_SOCIAL_CIRCLE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: DEF_60_CNT_SOCIAL_CIRCLE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0 0.0 0.0 0.1 0.0 24.0 1021
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.52471, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable DEF_60_CNT_SOCIAL_CIRCLE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: DAYS_LAST_PHONE_CHANGE
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-4292.0 -1570.0 -757.0 -962.9 -274.0 0.0 1
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.1221, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable DAYS_LAST_PHONE_CHANGE NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: AMT_REQ_CREDIT_BUREAU_HOUR
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.01 0.00 4.00 41519
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.52432, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable AMT_REQ_CREDIT_BUREAU_HOUR NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: AMT_REQ_CREDIT_BUREAU_DAY
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.01 0.00 9.00 41519
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.5196, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable AMT_REQ_CREDIT_BUREAU_DAY NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: AMT_REQ_CREDIT_BUREAU_WEEK
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.03 0.00 8.00 41519
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.53457, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable AMT_REQ_CREDIT_BUREAU_WEEK NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: AMT_REQ_CREDIT_BUREAU_MON
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.27 0.00 27.00 41519
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.45031, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable AMT_REQ_CREDIT_BUREAU_MON NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: AMT_REQ_CREDIT_BUREAU_QRT
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 0.00 0.27 0.00 261.00 41519
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.4408, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable AMT_REQ_CREDIT_BUREAU_QRT NO sigue una distribución normal (p < 0 )
-------------------------------------------------
Distribución de la variable: AMT_REQ_CREDIT_BUREAU_YEAR
-------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0 0.0 1.0 1.9 3.0 25.0 41519
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test
Test de Kolmogorov-Smirnov para la normalidad:
Asymptotic one-sample Kolmogorov-Smirnov test
data: datos[[col]]
D = 0.19321, p-value < 2.2e-16
alternative hypothesis: two-sided
❌ La variable AMT_REQ_CREDIT_BUREAU_YEAR NO sigue una distribución normal (p < 0 )
# Impute the missing values of a numeric vector with its median.
#
# Despite what earlier comments suggested, the replacement value is the
# median, not the mean.
#
# @param x A vector (typically one column of the data set).
# @return `x` with every NA replaced by median(x, na.rm = TRUE) when `x` is
#   numeric; vectors of any other type are returned untouched. When every
#   value is NA the median is NA too, so the vector comes back unchanged.
imputar_mediana <- function(x) {
  if (is.numeric(x)) {
    x[is.na(x)] <- median(x, na.rm = TRUE)
  }
  x
}

# Names of all numeric columns of `datos` (base R replacement for the
# superseded dplyr::select_if()).
numeric_columns <- names(datos)[vapply(datos, is.numeric, logical(1))]
# Replace the NAs of every numeric column with that column's median.
datos[numeric_columns] <- lapply(datos[numeric_columns], imputar_mediana)
# Re-count the missing values per column after the imputation.
data.frame(sort(colSums(is.na(datos))))
sort.colSums.is.na.datos...
SK_ID_CURR 0
TARGET 0
NAME_CONTRACT_TYPE 0
CODE_GENDER 0
FLAG_OWN_CAR 0
FLAG_OWN_REALTY 0
CNT_CHILDREN 0
AMT_INCOME_TOTAL 0
AMT_CREDIT 0
AMT_ANNUITY 0
AMT_GOODS_PRICE 0
NAME_TYPE_SUITE 0
NAME_INCOME_TYPE 0
NAME_EDUCATION_TYPE 0
NAME_FAMILY_STATUS 0
NAME_HOUSING_TYPE 0
REGION_POPULATION_RELATIVE 0
DAYS_BIRTH 0
DAYS_EMPLOYED 0
DAYS_REGISTRATION 0
DAYS_ID_PUBLISH 0
OWN_CAR_AGE 0
FLAG_MOBIL 0
FLAG_EMP_PHONE 0
FLAG_WORK_PHONE 0
FLAG_CONT_MOBILE 0
FLAG_PHONE 0
FLAG_EMAIL 0
OCCUPATION_TYPE 0
CNT_FAM_MEMBERS 0
REGION_RATING_CLIENT 0
REGION_RATING_CLIENT_W_CITY 0
WEEKDAY_APPR_PROCESS_START 0
HOUR_APPR_PROCESS_START 0
REG_REGION_NOT_LIVE_REGION 0
REG_REGION_NOT_WORK_REGION 0
LIVE_REGION_NOT_WORK_REGION 0
REG_CITY_NOT_LIVE_CITY 0
REG_CITY_NOT_WORK_CITY 0
LIVE_CITY_NOT_WORK_CITY 0
ORGANIZATION_TYPE 0
EXT_SOURCE_1 0
EXT_SOURCE_2 0
EXT_SOURCE_3 0
APARTMENTS_AVG 0
BASEMENTAREA_AVG 0
YEARS_BEGINEXPLUATATION_AVG 0
YEARS_BUILD_AVG 0
COMMONAREA_AVG 0
ELEVATORS_AVG 0
ENTRANCES_AVG 0
FLOORSMAX_AVG 0
FLOORSMIN_AVG 0
LANDAREA_AVG 0
LIVINGAPARTMENTS_AVG 0
LIVINGAREA_AVG 0
NONLIVINGAPARTMENTS_AVG 0
NONLIVINGAREA_AVG 0
APARTMENTS_MODE 0
BASEMENTAREA_MODE 0
YEARS_BEGINEXPLUATATION_MODE 0
YEARS_BUILD_MODE 0
COMMONAREA_MODE 0
ELEVATORS_MODE 0
ENTRANCES_MODE 0
FLOORSMAX_MODE 0
FLOORSMIN_MODE 0
LANDAREA_MODE 0
LIVINGAPARTMENTS_MODE 0
LIVINGAREA_MODE 0
NONLIVINGAPARTMENTS_MODE 0
NONLIVINGAREA_MODE 0
APARTMENTS_MEDI 0
BASEMENTAREA_MEDI 0
YEARS_BEGINEXPLUATATION_MEDI 0
YEARS_BUILD_MEDI 0
COMMONAREA_MEDI 0
ELEVATORS_MEDI 0
ENTRANCES_MEDI 0
FLOORSMAX_MEDI 0
FLOORSMIN_MEDI 0
LANDAREA_MEDI 0
LIVINGAPARTMENTS_MEDI 0
LIVINGAREA_MEDI 0
NONLIVINGAPARTMENTS_MEDI 0
NONLIVINGAREA_MEDI 0
FONDKAPREMONT_MODE 0
HOUSETYPE_MODE 0
TOTALAREA_MODE 0
WALLSMATERIAL_MODE 0
EMERGENCYSTATE_MODE 0
OBS_30_CNT_SOCIAL_CIRCLE 0
DEF_30_CNT_SOCIAL_CIRCLE 0
OBS_60_CNT_SOCIAL_CIRCLE 0
DEF_60_CNT_SOCIAL_CIRCLE 0
DAYS_LAST_PHONE_CHANGE 0
FLAG_DOCUMENT_2 0
FLAG_DOCUMENT_3 0
FLAG_DOCUMENT_4 0
FLAG_DOCUMENT_5 0
FLAG_DOCUMENT_6 0
FLAG_DOCUMENT_7 0
FLAG_DOCUMENT_8 0
FLAG_DOCUMENT_9 0
FLAG_DOCUMENT_10 0
FLAG_DOCUMENT_11 0
FLAG_DOCUMENT_12 0
FLAG_DOCUMENT_13 0
FLAG_DOCUMENT_14 0
FLAG_DOCUMENT_15 0
FLAG_DOCUMENT_16 0
FLAG_DOCUMENT_17 0
FLAG_DOCUMENT_18 0
FLAG_DOCUMENT_19 0
FLAG_DOCUMENT_20 0
FLAG_DOCUMENT_21 0
AMT_REQ_CREDIT_BUREAU_HOUR 0
AMT_REQ_CREDIT_BUREAU_DAY 0
AMT_REQ_CREDIT_BUREAU_WEEK 0
AMT_REQ_CREDIT_BUREAU_MON 0
AMT_REQ_CREDIT_BUREAU_QRT 0
AMT_REQ_CREDIT_BUREAU_YEAR 0
Estandarizar valores
Primero pasamos las columnas con días negativos a positivos
# Day-count columns that the notebook converts from negative to positive.
# NOTE(review): DAYS_LAST_PHONE_CHANGE also takes negative values but is not
# listed here - confirm whether leaving it out is intentional.
date_col <- c(
  "DAYS_BIRTH", "DAYS_EMPLOYED", "DAYS_REGISTRATION", "DAYS_ID_PUBLISH"
)
# Take the absolute value column by column.
datos[date_col] <- lapply(datos[date_col], abs)
Ahora vamos a organizar a las personas segun su nivel de ingresos (Dicotomizamos)
# Express income in units of 100,000 so the bin edges below are small
# integers. NOTE: the column is rescaled in place, so running this chunk a
# second time would divide it again.
datos$AMT_INCOME_TOTAL <- datos$AMT_INCOME_TOTAL / 100000
# Bin edges in units of 100K. The last edge is Inf so that "1M Above" is
# truly open-ended: with a finite top edge of 11, cut() returned NA for any
# income above 1.1M and table() then silently dropped those rows.
bins <- c(0:10, Inf)
# One label per interval (length(bins) - 1 labels).
slot <- c(
  "0-100K", "100K-200K", "200K-300K", "300K-400K", "400K-500K",
  "500K-600K", "600K-700K", "700K-800K", "800K-900K", "900K-1M", "1M Above"
)
# Intervals are right-closed, (a, b]; include.lowest = TRUE also keeps a
# value of exactly 0 in the first bin.
datos$AMT_INCOME_RANGE <- cut(
  datos$AMT_INCOME_TOTAL,
  breaks = bins, labels = slot, include.lowest = TRUE
)
# Relative frequency (%) of each income range.
prop.table(table(datos$AMT_INCOME_RANGE)) * 100
0-100K 100K-200K 200K-300K 300K-400K 400K-500K 500K-600K
20.729695163 50.734999788 21.210691261 4.776115517 1.744668526 0.356353672
600K-700K 700K-800K 800K-900K 900K-1M 1M Above
0.282804878 0.052720817 0.096980269 0.009112240 0.005857869
Realizamos lo mismo para la cantidad de crédito, la edad y los años trabajados para facilitar las comparaciones en el futuro
# Express the credit amount in units of 100,000 so the bin edges below are
# small integers. NOTE: the column is rescaled in place, so running this
# chunk a second time would divide it again.
datos$AMT_CREDIT <- datos$AMT_CREDIT / 100000
# Bin edges in units of 100K. The last edge is Inf so that "1M Above" is
# truly open-ended; a finite top edge (previously 100, i.e. 10M) makes cut()
# return NA for anything larger, and table() then drops those rows.
bins <- c(0:10, Inf)
# One label per interval (length(bins) - 1 labels).
slots <- c(
  "0-100K", "100K-200K", "200K-300K", "300K-400K", "400K-500K",
  "500K-600K", "600K-700K", "700K-800K", "800K-900K", "900K-1M", "1M Above"
)
# Intervals are right-closed, (a, b]; include.lowest = TRUE also keeps a
# value of exactly 0 in the first bin.
datos$AMT_CREDIT_RANGE <- cut(
  datos$AMT_CREDIT,
  breaks = bins, labels = slots, include.lowest = TRUE
)
# Relative frequency (%) of each credit range.
prop.table(table(datos$AMT_CREDIT_RANGE)) * 100
0-100K 100K-200K 200K-300K 300K-400K 400K-500K 500K-600K 600K-700K 700K-800K
1.952450 9.801275 17.824728 8.564897 10.418489 11.131960 7.820533 6.241403
800K-900K 900K-1M 1M Above
7.086576 2.902986 16.254703
# Age in whole years, derived from the day count stored in DAYS_BIRTH.
datos[["AGE"]] <- floor(abs(datos[["DAYS_BIRTH"]]) / 365)
# Edges (in years) of the age groups and the label of each interval.
bins <- c(0, 20, 30, 40, 50, 100)
slots <- c("0-20", "20-30", "30-40", "40-50", "50 above")
# Right-closed intervals; include.lowest = TRUE keeps an age of 0 in "0-20".
datos[["AGE_GROUP"]] <- cut(
  datos[["AGE"]],
  breaks = bins, labels = slots, include.lowest = TRUE
)
# Share (%) of observations that falls in each age group.
prop.table(table(datos[["AGE_GROUP"]])) * 100
0-20 20-30 30-40 40-50 50 above
3.251916e-04 1.717174e+01 2.702895e+01 2.419458e+01 3.160440e+01
# Whole years employed, derived from the day count stored in DAYS_EMPLOYED.
# (datos$AGE is not recomputed here: it is already created, with the same
# expression, when the age groups are built.)
datos$YEARS_EMPLOYED <- floor(abs(datos$DAYS_EMPLOYED) / 365)
# Edges (in years) of the employment-length bins and one label per interval.
bins <- c(0, 5, 10, 20, 30, 40, 50, 60, 150)
slots <- c(
  "0-5", "5-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60 above"
)
# Right-closed intervals; include.lowest = TRUE keeps 0 years in "0-5".
# NOTE(review): cut() returns NA for anything above 150 years and table()
# drops NAs, so such rows are absent from the percentages below. Check
# whether DAYS_EMPLOYED holds placeholder values before relying on them.
datos$EMPLOYMENT_YEAR <- cut(
  datos$YEARS_EMPLOYED,
  breaks = bins, labels = slots, include.lowest = TRUE
)
# Relative frequency (%) of each employment-length group.
prop.table(table(datos$EMPLOYMENT_YEAR)) * 100
0-5 5-10 10-20 20-30 30-40 40-50
60.49806256 22.20340529 12.95248218 3.33509164 0.94155162 0.06940671
50-60 60 above
0.00000000 0.00000000
Esto se lleva a cabo para facilitar la comparación entre observaciones y la clasificación de los modelos, ya que permite ver la diferencia entre los distintos grupos.
Usar penalización L1 para la regresión. Apuntaría brevemente, en cada caso, qué puedes hacer para seguir.
Factorial de variables
Variables economicas
# Variables that enter the "economic" factor analysis.
# Currently left out: "CNT_FAM_MEMBERS", "CNT_CHILDREN".
economic_vars <- datos[, c(
  "AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY",
  "AMT_GOODS_PRICE", "OWN_CAR_AGE", "DAYS_EMPLOYED"
)]
# Centre and scale every column before fitting.
economic_vars_scaled <- scale(economic_vars)
# Two-factor model with varimax rotation.
factor_analysis <- economic_vars_scaled |>
  factanal(factors = 2, rotation = "varimax")
# Show only loadings above 0.3 in absolute value, sorted, with 3 digits.
print(
  factor_analysis,
  digits = 3, cutoff = 0.3, sort = TRUE
)
Call:
factanal(x = economic_vars_scaled, factors = 2, rotation = "varimax")
Uniquenesses:
AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE
0.908 0.020 0.328 0.006
OWN_CAR_AGE DAYS_EMPLOYED
0.999 0.953
Loadings:
Factor1 Factor2
AMT_CREDIT 0.973
AMT_ANNUITY 0.717 0.398
AMT_GOODS_PRICE 0.980
AMT_INCOME_TOTAL
OWN_CAR_AGE
DAYS_EMPLOYED
Factor1 Factor2
SS loadings 2.436 0.351
Proportion Var 0.406 0.059
Cumulative Var 0.406 0.464
Test of the hypothesis that 2 factors are sufficient.
The chi square statistic is 671.06 on 4 degrees of freedom.
The p-value is 6.43e-144
# Print the loadings again, this time without the cutoff = 0.3 and sort = TRUE
# options used in the previous print() call
print(factor_analysis$loadings)
Loadings:
Factor1 Factor2
AMT_INCOME_TOTAL 0.110 0.283
AMT_CREDIT 0.973 0.182
AMT_ANNUITY 0.717 0.398
AMT_GOODS_PRICE 0.980 0.181
OWN_CAR_AGE
DAYS_EMPLOYED -0.216
Factor1 Factor2
SS loadings 2.436 0.351
Proportion Var 0.406 0.059
Cumulative Var 0.406 0.464
# Separator printed before the sampling-adequacy diagnostics.
# The statement and its console output were fused on one line; they are split here.
print("------------------------- KMO -----------------------------------")
[1] "------------------------- KMO -----------------------------------"
# Kaiser-Meyer-Olkin sampling adequacy index
KMO(economic_vars_scaled)
Kaiser-Meyer-Olkin factor adequacy
Call: KMO(r = economic_vars_scaled)
Overall MSA = 0.7
MSA for each item =
AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE
0.87 0.63 0.97 0.63
OWN_CAR_AGE DAYS_EMPLOYED
0.61 0.70
# Bartlett's test of sphericity. The data matrix (not a correlation matrix) is
# passed, which is why the message below is printed.
cortest.bartlett(economic_vars_scaled)
R was not square, finding R from data
$chisq
[1] 1417942
$p.value
[1] 0
$df
[1] 15
# Separator printed before the loadings table is built
print("------------------------ loadings ------------------------------------")
[1] "------------------------ loadings ------------------------------------"
# Loadings of the two factors as a data frame, keeping the variable names in
# their own column so they can be used as plot labels
loadings <- as.data.frame(factor_analysis$loadings[, 1:2])
loadings$Variable <- rownames(loadings)
print("-------------------------- ggplot ----------------------------------")
[1] "-------------------------- ggplot ----------------------------------"
# Principal components of the same variables, used for the scree plot.
# The argument is spelled `scale.` in full: `scale = TRUE` only worked through
# partial argument matching.
pca_result <- prcomp(economic_vars_scaled, scale. = TRUE)
screeplot(pca_result, type = "lines", main = "Scree Plot")
# Factor1 against Factor2 loadings, one text label per variable
ggplot(loadings, aes(x = Factor1, y = Factor2, label = Variable)) +
  geom_text(size = 5) +
  theme_minimal() +
  ggtitle("Carga Factorial de Variables Económicas")
Valores atipicos
# Variables inspected for outliers, kept in two groups
app_outlier_col_1 <- c("AMT_ANNUITY", "AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_GOODS_PRICE", "DAYS_EMPLOYED")
app_outlier_col_2 <- c("CNT_CHILDREN", "DAYS_BIRTH")
# Build one boxplot of the column `col_name` of datos, titled with the column name
outlier_boxplot <- function(col_name) {
  ggplot(datos, aes(y = .data[[col_name]])) +
    geom_boxplot(fill = "lightblue", color = "black") +
    labs(title = col_name, y = "") +
    theme_minimal()
}
# One list of boxplots per group of variables
plots1 <- lapply(app_outlier_col_1, outlier_boxplot)
plots2 <- lapply(app_outlier_col_2, outlier_boxplot)
# Show all the plots in a single figure, four per row
grid.arrange(grobs = c(plots1, plots2), ncol = 4)
# Remove the "60 above" and "50-60" categories for YEARS_EMPLOYED
# %in% never returns NA, so rows whose EMPLOYMENT_YEAR is NA are kept by this filter
datos <- datos[!datos$EMPLOYMENT_YEAR %in% c("50-60", "60 above"), ]
# Drop the "XNA" gender category (the original note says it has 0 observations)
# and then remove the unused factor level
datos <- datos[datos$CODE_GENDER != "XNA", ]
datos$CODE_GENDER <- droplevels(datos$CODE_GENDER)
# Unemployed applicants caused problems and need an explicit EMPLOYMENT_YEAR value.
# NOTE(review): the original comment said they are assigned to "0-5", but the
# code assigns the separate category "0" — confirm which one is intended.
datos$EMPLOYMENT_YEAR <- ifelse(
  datos$NAME_INCOME_TYPE == "Unemployed", "0", as.character(datos$EMPLOYMENT_YEAR))
datos$EMPLOYMENT_YEAR <- as.factor(datos$EMPLOYMENT_YEAR)
# Rows that still contain missing values (they could not be replaced, whether
# because of outliers or because they cause problems) are removed.
# The section heading that was fused onto this statement is back on its own line.
datos <- na.omit(datos)
Tablas de contingencia
# Print a contingency table and a chi-squared test of df[[x]] against each
# column named in vec.
#
# Arguments:
#   df  - data frame holding the columns.
#   x   - name of the row variable (single string); its dimension is always
#         labelled "TARGET" in the printed table.
#   vec - character vector with the names of the column variables; their
#         dimension is labelled "Variable".
# Returns NULL invisibly; the function is used for its printed output.
# Stops with an error if x or any element of vec is not a column of df.
tb_conting <- function(df, x, vec) {
  missing_cols <- setdiff(c(x, vec), names(df))
  if (length(missing_cols) > 0) {
    stop(
      "Columnas no encontradas en el dataframe: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }
  for (var_name in vec) {
    cat("\nTabla de Contingencia para:", var_name, "\n")
    # factor() drops unused levels, so the table has exactly one row/column per
    # observed value. Assigning dimnames built from levels(factor(.)) to a plain
    # table() failed whenever a factor column carried an unused level.
    tab <- table(factor(df[[x]]), factor(df[[var_name]]), dnn = c("TARGET", "Variable"))
    print(tab)
    cat("\nTest de Chi-Cuadrado:\n")
    chi_test <- chisq.test(tab)
    print(chi_test)
    cat("\n--------------------------\n")
  }
  invisible(NULL)
}
# Call the function; the data frame passed here is datos
tb_conting(datos, "TARGET", contact_col) # You can also try col_Doc or ext
Tabla de Contingencia para: FLAG_MOBIL
Variable
TARGET 0 1
0 1 230100
1 0 21832
Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 1.5365e-21, df = 1, p-value = 1
--------------------------
Tabla de Contingencia para: FLAG_EMP_PHONE
Variable
TARGET 0 1
0 25 230076
1 9 21823
Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 11.463, df = 1, p-value = 0.00071
--------------------------
Tabla de Contingencia para: FLAG_WORK_PHONE
Variable
TARGET 0 1
0 174753 55348
1 15931 5901
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 95.769, df = 1, p-value < 2.2e-16
--------------------------
Tabla de Contingencia para: FLAG_CONT_MOBILE
Variable
TARGET 0 1
0 490 229611
1 43 21789
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 0.17172, df = 1, p-value = 0.6786
--------------------------
Tabla de Contingencia para: FLAG_PHONE
Variable
TARGET 0 1
0 165456 64645
1 16534 5298
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 145.43, df = 1, p-value < 2.2e-16
--------------------------
Tabla de Contingencia para: FLAG_EMAIL
Variable
TARGET 0 1
0 215398 14703
1 20550 1282
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 8.9061, df = 1, p-value = 0.002842
--------------------------
# Same tables and tests for the columns listed in col_Doc
tb_conting(datos, "TARGET", col_Doc) # contact_col or ext can also be tried here
Tabla de Contingencia para: FLAG_DOCUMENT_2
Variable
TARGET 0 1
0 230092 9
1 21828 4
Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 5.4752, df = 1, p-value = 0.01929
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_3
Variable
TARGET 0 1
0 55754 174347
1 3938 17894
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 422.58, df = 1, p-value < 2.2e-16
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_4
Variable
TARGET 0 1
0 230081 20
1 21832 0
Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 0.96073, df = 1, p-value = 0.327
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_5
Variable
TARGET 0 1
0 226356 3745
1 21483 349
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 0.08738, df = 1, p-value = 0.7675
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_6
Variable
TARGET 0 1
0 228050 2051
1 21698 134
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 17.547, df = 1, p-value = 2.803e-05
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_7
Variable
TARGET 0 1
0 230053 48
1 21829 3
Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 0.20952, df = 1, p-value = 0.6471
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_8
Variable
TARGET 0 1
0 207501 22600
1 20016 1816
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 51.344, df = 1, p-value = 7.753e-13
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_9
Variable
TARGET 0 1
0 229018 1083
1 21759 73
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 7.8137, df = 1, p-value = 0.005185
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_10
Variable
TARGET 0 1
0 230095 6
1 21832 0
Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 0.00083793, df = 1, p-value = 0.9769
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_11
Variable
TARGET 0 1
0 228975 1126
1 21757 75
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 8.6318, df = 1, p-value = 0.003304
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_12
Variable
TARGET 0 1
0 230099 2
1 21832 0
Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 3.2817e-25, df = 1, p-value = 1
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_13
Variable
TARGET 0 1
0 229065 1036
1 21803 29
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 46.972, df = 1, p-value = 7.201e-12
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_14
Variable
TARGET 0 1
0 229246 855
1 21802 30
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 30.569, df = 1, p-value = 3.222e-08
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_15
Variable
TARGET 0 1
0 229750 351
1 21821 11
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 13.8, df = 1, p-value = 0.0002033
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_16
Variable
TARGET 0 1
0 227250 2851
1 21682 150
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 51.145, df = 1, p-value = 8.578e-13
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_17
Variable
TARGET 0 1
0 230022 79
1 21830 2
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 3.1868, df = 1, p-value = 0.07424
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_18
Variable
TARGET 0 1
0 227770 2331
1 21690 142
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 26.603, df = 1, p-value = 2.499e-07
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_19
Variable
TARGET 0 1
0 229934 167
1 21820 12
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 0.64071, df = 1, p-value = 0.4235
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_20
Variable
TARGET 0 1
0 229959 142
1 21819 13
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 2.8278e-28, df = 1, p-value = 1
--------------------------
Tabla de Contingencia para: FLAG_DOCUMENT_21
Variable
TARGET 0 1
0 230012 89
1 21818 14
Test de Chi-Cuadrado:
Pearson's Chi-squared test with Yates' continuity correction
data: tab
X-squared = 2.5676, df = 1, p-value = 0.1091
--------------------------
Analisis de Datos
En un principio me interesa saber cuáles son las variables más importantes a la hora de predecir si alguien va a devolver el pago o no; por tanto, ajustamos un modelo con todas las variables y hacemos el ANOVA para ver cuáles son las más significativas.
# ANOVA table of a linear model of TARGET on every other column of datos.
# anova() on a single lm fit reports sequential sums of squares, so each term's
# Sum Sq depends on the order of the columns in datos.
#anova(lm(TARGET~.,data=datos))
anova_results <- anova(lm(TARGET ~ ., data = datos))
# Sort by sum of squares (Sum Sq) in descending order; the outer parentheses
# print the sorted table. The console output that was fused onto this statement
# is back on its own line.
(anova_sorted <- anova_results[order(-anova_results$`Sum Sq`), ])
Analysis of Variance Table
Response: TARGET
Df Sum Sq Mean Sq F value Pr(>F)
Residuals 251667 18566.3 0.07
EXT_SOURCE_3 1 324.4 324.44 4397.8144 < 2.2e-16 ***
EXT_SOURCE_2 1 320.4 320.40 4343.0465 < 2.2e-16 ***
DAYS_BIRTH 1 61.4 61.44 832.7596 < 2.2e-16 ***
AMT_GOODS_PRICE 1 57.0 57.04 773.1698 < 2.2e-16 ***
FLAG_OWN_CAR 1 51.8 51.78 701.8314 < 2.2e-16 ***
EXT_SOURCE_1 1 49.1 49.13 666.0191 < 2.2e-16 ***
CODE_GENDER 1 47.0 47.01 637.2044 < 2.2e-16 ***
DAYS_EMPLOYED 1 42.2 42.18 571.6970 < 2.2e-16 ***
REGION_RATING_CLIENT 2 41.8 20.90 283.3044 < 2.2e-16 ***
NAME_EDUCATION_TYPE 4 39.5 9.89 134.0142 < 2.2e-16 ***
AMT_INCOME_TOTAL 1 29.3 29.31 397.3253 < 2.2e-16 ***
NAME_INCOME_TYPE 7 28.2 4.03 54.6124 < 2.2e-16 ***
AMT_CREDIT_RANGE 10 26.1 2.61 35.3979 < 2.2e-16 ***
NAME_CONTRACT_TYPE 1 26.0 25.98 352.1684 < 2.2e-16 ***
NAME_FAMILY_STATUS 5 23.3 4.65 63.0893 < 2.2e-16 ***
AMT_CREDIT 1 21.6 21.62 293.1216 < 2.2e-16 ***
ORGANIZATION_TYPE 56 20.9 0.37 5.0689 < 2.2e-16 ***
DAYS_ID_PUBLISH 1 18.4 18.45 250.0321 < 2.2e-16 ***
OCCUPATION_TYPE 18 17.3 0.96 12.9906 < 2.2e-16 ***
REGION_POPULATION_RELATIVE 1 14.8 14.80 200.5901 < 2.2e-16 ***
NAME_HOUSING_TYPE 5 11.8 2.36 32.0560 < 2.2e-16 ***
FLAG_WORK_PHONE 1 10.0 9.99 135.4143 < 2.2e-16 ***
DEF_30_CNT_SOCIAL_CIRCLE 1 9.9 9.88 133.9120 < 2.2e-16 ***
REG_CITY_NOT_LIVE_CITY 1 8.0 8.05 109.0894 < 2.2e-16 ***
DAYS_REGISTRATION 1 6.9 6.93 93.9007 < 2.2e-16 ***
REGION_RATING_CLIENT_W_CITY 2 6.7 3.36 45.5134 < 2.2e-16 ***
FLAG_DOCUMENT_3 1 5.3 5.32 72.1300 < 2.2e-16 ***
AGE_GROUP 4 4.8 1.20 16.3149 2.278e-13 ***
AMT_ANNUITY 1 4.7 4.71 63.8379 1.357e-15 ***
EMPLOYMENT_YEAR 5 4.2 0.85 11.4651 4.346e-11 ***
FLAG_PHONE 1 3.6 3.58 48.5729 3.190e-12 ***
OWN_CAR_AGE 1 2.9 2.91 39.3905 3.475e-10 ***
CNT_CHILDREN 1 2.7 2.70 36.5516 1.489e-09 ***
DAYS_LAST_PHONE_CHANGE 1 2.5 2.55 34.5519 4.156e-09 ***
NAME_TYPE_SUITE 7 2.5 0.35 4.7660 2.269e-05 ***
FLAG_DOCUMENT_18 1 2.2 2.19 29.7455 4.931e-08 ***
FLAG_DOCUMENT_16 1 2.0 2.03 27.5070 1.567e-07 ***
WEEKDAY_APPR_PROCESS_START 6 1.7 0.28 3.7869 0.0008957 ***
REG_CITY_NOT_WORK_CITY 1 1.6 1.59 21.5340 3.478e-06 ***
WALLSMATERIAL_MODE 7 1.5 0.22 2.9610 0.0041965 **
HOUR_APPR_PROCESS_START 1 1.2 1.21 16.4395 5.024e-05 ***
AMT_REQ_CREDIT_BUREAU_QRT 1 1.1 1.11 15.0015 0.0001075 ***
APARTMENTS_AVG 1 1.0 1.04 14.0337 0.0001796 ***
FLOORSMAX_AVG 1 1.0 0.97 13.1742 0.0002839 ***
FLAG_DOCUMENT_5 1 0.9 0.93 12.6326 0.0003791 ***
FLAG_DOCUMENT_2 1 0.9 0.92 12.5057 0.0004058 ***
FONDKAPREMONT_MODE 4 0.9 0.22 3.0493 0.0159457 *
AMT_INCOME_RANGE 10 0.9 0.09 1.1879 0.2932210
OBS_30_CNT_SOCIAL_CIRCLE 1 0.8 0.80 10.8391 0.0009939 ***
YEARS_EMPLOYED 1 0.6 0.57 7.7459 0.0053838 **
AMT_REQ_CREDIT_BUREAU_WEEK 1 0.5 0.52 6.9830 0.0082291 **
YEARS_BUILD_AVG 1 0.5 0.48 6.4465 0.0111178 *
FLAG_DOCUMENT_14 1 0.5 0.47 6.4220 0.0112724 *
FLAG_EMAIL 1 0.5 0.45 6.1265 0.0133175 *
EMERGENCYSTATE_MODE 2 0.4 0.22 3.0360 0.0480295 *
FLAG_DOCUMENT_13 1 0.4 0.43 5.8131 0.0159078 *
FLAG_DOCUMENT_8 1 0.4 0.43 5.7647 0.0163520 *
FLAG_CONT_MOBILE 1 0.4 0.42 5.6940 0.0170233 *
YEARS_BEGINEXPLUATATION_AVG 1 0.4 0.36 4.8939 0.0269526 *
NONLIVINGAREA_MODE 1 0.3 0.26 3.5764 0.0586088 .
FLAG_DOCUMENT_15 1 0.2 0.23 3.1349 0.0766360 .
AMT_REQ_CREDIT_BUREAU_MON 1 0.2 0.23 3.1285 0.0769348 .
HOUSETYPE_MODE 3 0.2 0.07 0.9616 0.4097277
COMMONAREA_AVG 1 0.2 0.19 2.5417 0.1108763
FLAG_DOCUMENT_6 1 0.2 0.18 2.4004 0.1213045
FLAG_OWN_REALTY 1 0.2 0.16 2.2237 0.1359122
FLAG_DOCUMENT_9 1 0.2 0.16 2.1733 0.1404240
AGE 1 0.1 0.13 1.8233 0.1769271
ELEVATORS_AVG 1 0.1 0.13 1.8050 0.1791088
DEF_60_CNT_SOCIAL_CIRCLE 1 0.1 0.13 1.7604 0.1845817
FLAG_DOCUMENT_17 1 0.1 0.13 1.7157 0.1902446
BASEMENTAREA_AVG 1 0.1 0.12 1.6214 0.2028973
LIVINGAPARTMENTS_MODE 1 0.1 0.11 1.4871 0.2226666
LIVE_REGION_NOT_WORK_REGION 1 0.1 0.10 1.4045 0.2359668
NONLIVINGAPARTMENTS_MODE 1 0.1 0.10 1.3907 0.2382887
COMMONAREA_MEDI 1 0.1 0.10 1.3692 0.2419498
ENTRANCES_AVG 1 0.1 0.10 1.3633 0.2429630
LIVINGAPARTMENTS_MEDI 1 0.1 0.10 1.3366 0.2476295
LIVE_CITY_NOT_WORK_CITY 1 0.1 0.09 1.2457 0.2643680
LANDAREA_MODE 1 0.1 0.09 1.2198 0.2693984
LANDAREA_MEDI 1 0.1 0.08 1.0381 0.3082754
YEARS_BEGINEXPLUATATION_MEDI 1 0.1 0.08 1.0232 0.3117660
LANDAREA_AVG 1 0.1 0.07 0.9788 0.3224996
OBS_60_CNT_SOCIAL_CIRCLE 1 0.1 0.07 0.9416 0.3318593
FLAG_DOCUMENT_11 1 0.1 0.06 0.8320 0.3617001
ENTRANCES_MODE 1 0.1 0.06 0.8116 0.3676376
BASEMENTAREA_MEDI 1 0.1 0.06 0.7901 0.3740785
FLAG_DOCUMENT_19 1 0.1 0.05 0.7153 0.3976827
FLAG_DOCUMENT_10 1 0.0 0.04 0.5862 0.4438763
LIVINGAREA_MEDI 1 0.0 0.04 0.5707 0.4499938
ELEVATORS_MODE 1 0.0 0.04 0.5581 0.4550310
SK_ID_CURR 1 0.0 0.04 0.5540 0.4567054
YEARS_BUILD_MEDI 1 0.0 0.03 0.4681 0.4938829
FLAG_DOCUMENT_4 1 0.0 0.03 0.4222 0.5158615
NONLIVINGAREA_AVG 1 0.0 0.02 0.3274 0.5671999
FLAG_DOCUMENT_20 1 0.0 0.02 0.3222 0.5702982
LIVINGAREA_AVG 1 0.0 0.02 0.3194 0.5719686
NONLIVINGAPARTMENTS_MEDI 1 0.0 0.02 0.2963 0.5861961
APARTMENTS_MODE 1 0.0 0.02 0.2932 0.5881761
FLOORSMAX_MODE 1 0.0 0.02 0.2930 0.5883071
FLAG_MOBIL 1 0.0 0.02 0.2601 0.6100338
ENTRANCES_MEDI 1 0.0 0.01 0.2030 0.6522734
FLAG_DOCUMENT_7 1 0.0 0.01 0.2027 0.6525228
FLOORSMAX_MEDI 1 0.0 0.01 0.2024 0.6528160
YEARS_BUILD_MODE 1 0.0 0.01 0.1750 0.6757210
AMT_REQ_CREDIT_BUREAU_YEAR 1 0.0 0.01 0.1716 0.6787315
FLAG_DOCUMENT_21 1 0.0 0.01 0.1644 0.6851184
FLOORSMIN_AVG 1 0.0 0.01 0.1484 0.7000980
LIVINGAREA_MODE 1 0.0 0.01 0.1243 0.7244641
TOTALAREA_MODE 1 0.0 0.01 0.0983 0.7539129
FLAG_DOCUMENT_12 1 0.0 0.01 0.0856 0.7698540
FLAG_EMP_PHONE 1 0.0 0.01 0.0801 0.7770983
YEARS_BEGINEXPLUATATION_MODE 1 0.0 0.01 0.0787 0.7791331
ELEVATORS_MEDI 1 0.0 0.00 0.0570 0.8112468
NONLIVINGAREA_MEDI 1 0.0 0.00 0.0412 0.8391954
FLOORSMIN_MODE 1 0.0 0.00 0.0403 0.8408716
APARTMENTS_MEDI 1 0.0 0.00 0.0269 0.8698346
REG_REGION_NOT_LIVE_REGION 1 0.0 0.00 0.0207 0.8855962
LIVINGAPARTMENTS_AVG 1 0.0 0.00 0.0157 0.9004427
AMT_REQ_CREDIT_BUREAU_HOUR 1 0.0 0.00 0.0138 0.9066474
FLOORSMIN_MEDI 1 0.0 0.00 0.0099 0.9208284
COMMONAREA_MODE 1 0.0 0.00 0.0068 0.9340822
BASEMENTAREA_MODE 1 0.0 0.00 0.0038 0.9505744
AMT_REQ_CREDIT_BUREAU_DAY 1 0.0 0.00 0.0004 0.9845057
REG_REGION_NOT_WORK_REGION 1 0.0 0.00 0.0001 0.9910909
NONLIVINGAPARTMENTS_AVG 1 0.0 0.00 0.0001 0.9926658
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
EXT_SOURCE_3 AMT_GOODS_PRICE FLAG_OWN_CAR EXT_SOURCE_1 CODE_GENDER DAYS_BIRTH NAME_EDUCATION_TYPE DAYS_EMPLOYED AMT_CREDIT NAME_INCOME_TYPE EXT_SOURCE_2 NAME_CONTRACT_TYPE OCCUPATION_TYPE NAME_FAMILY_STATUS AMT_CREDIT_RANGE
# Count the frequency of each category of the TARGET variable
Imbalance <- as.data.frame(table(datos$TARGET))
colnames(Imbalance) <- c("Loan_Repayment_Status", "Count")
# Replace the values 0 and 1 with meaningful labels
Imbalance$Loan_Repayment_Status <- factor(
  Imbalance$Loan_Repayment_Status,
  levels = c(0, 1),
  labels = c("Repayer", "Defaulter")
)
# Bar chart of the number of repayers and defaulters.
# The sentence that was fused onto theme_minimal() is back on its own line.
ggplot(Imbalance, aes(x = Loan_Repayment_Status, y = Count, fill = Loan_Repayment_Status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("green", "red")) +
  labs(
    title = "Imbalance Plotting",
    x = "Loan Repayment Status",
    y = "Count of Repayers & Defaulters"
  ) +
  theme_minimal()
definimos una funcion que dado una variable nos de un histograma con los pagos devueltos y no devueltos segun la variable
# Bar chart with, for every value of `variable`, the percentage of repayers and
# defaulters among the rows that take that value.
#
# Arguments:
#   df       - data frame with the column `variable` and a TARGET column; TARGET
#              values 0 and 1 are relabelled "Repayer" and "Defaulter".
#   variable - name of the column to analyse (single string).
# Returns the ggplot object.
# Stops with an error if `variable` is not a column of df.
plot_loan_repayment <- function(df, variable) {
  # Check that the variable exists
  if (!(variable %in% colnames(df))) {
    stop("La variable especificada no existe en el dataframe.")
  }
  # Working data frame with just the two columns needed
  df_plot <- df[, c(variable, "TARGET")]
  # Convert TARGET to a labelled factor
  df_plot$TARGET <- factor(df_plot$TARGET, levels = c(0, 1), labels = c("Repayer", "Defaulter"))
  # Count rows per (value, TARGET) pair, then turn the counts into percentages
  # within each value of the variable
  df_prop <- df_plot %>%
    group_by(.data[[variable]], TARGET) %>%
    summarise(n = n(), .groups = "drop") %>%
    group_by(.data[[variable]]) %>%
    mutate(pct = n / sum(n) * 100)
  # Plot the percentages. aes() with the .data pronoun replaces the deprecated
  # aes_string(), which emitted a deprecation warning on every call.
  ggplot(df_prop, aes(x = .data[[variable]], y = pct, fill = TARGET)) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(
      title = paste("Distribución porcentual de", variable, "según estado de pago"),
      x = variable, y = "Porcentaje (%)"
    ) +
    scale_fill_manual(values = c("green", "red")) +
    scale_x_discrete(guide = guide_axis(angle = 45)) +
    theme_minimal()
}
# Definir la función
# plot_loan_repayment <- function(df, variable) {
# # Verificar que la variable existe en el dataframe
# if (!(variable %in% colnames(df))) {
# stop("La variable especificada no existe en el dataframe.")
# }
#
# # Crear un dataframe con la variable seleccionada y la variable TARGET
# df_plot <- df[, c(variable, "TARGET")]
#
# # Convertir TARGET a factor con etiquetas
# df_plot$TARGET <- factor(df_plot$TARGET, levels = c(0,1), labels = c("Repayer", "Defaulter"))
#
# # Crear el gráfico
# ggplot(df_plot, aes_string(x = variable, fill = "TARGET")) +
# geom_bar(position = "dodge") +
# labs(title = paste("Distribución de", variable, "según el estado de pago del préstamo"),
# x = variable,
# y = "Frecuencia") +
# scale_fill_manual(values = c("green", "red")) +
# scale_x_discrete(guide = guide_axis(angle = 45)) +
# theme_minimal()
# }Graficar variables categoricas
# Example usage with the FLAG_OWN_CAR variable
plot_loan_repayment(datos, "FLAG_OWN_CAR")
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
# Same plot for the remaining categorical variables. The calls were fused on a
# single line; each one is now its own statement.
plot_loan_repayment(datos, "CODE_GENDER")
plot_loan_repayment(datos, "NAME_CONTRACT_TYPE")
plot_loan_repayment(datos, "NAME_EDUCATION_TYPE")
plot_loan_repayment(datos, "NAME_INCOME_TYPE")
plot_loan_repayment(datos, "AMT_CREDIT_RANGE")
plot_loan_repayment(datos, "NAME_FAMILY_STATUS")
plot_loan_repayment(datos, "ORGANIZATION_TYPE")
plot_loan_repayment(datos, "OCCUPATION_TYPE")
plot_loan_repayment(datos, "NAME_HOUSING_TYPE")
plot_loan_repayment(datos, "EMPLOYMENT_YEAR")
plot_loan_repayment(datos, "FLAG_DOCUMENT_3")
plot_loan_repayment(datos, "NAME_TYPE_SUITE")
Graficar variables continuas
# Density curves of `variable`, one per TARGET value, with the share of rows of
# each TARGET value shown in the legend.
#
# Arguments:
#   data     - data frame with the column `variable` and a TARGET column.
#   variable - name of the column to plot (single string).
# Returns the ggplot object.
# Stops with an error if `variable` is not a column of data.
graficar_variable <- function(data, variable) {
  # Same check (and message) as plot_loan_repayment()
  if (!(variable %in% colnames(data))) {
    stop("La variable especificada no existe en el dataframe.")
  }
  # Percentage of rows in each TARGET class
  porcentajes <- data %>%
    group_by(TARGET) %>%
    summarise(n = n()) %>%
    mutate(porc = paste0(round(100 * n / sum(n), 1), "%"))
  # Legend labels, built from the rows that hold the percentages so that each
  # class name is always paired with its own percentage
  etiquetas <- paste0(
    ifelse(porcentajes$TARGET == 0, "Repayers", "Defaulters"),
    " (", porcentajes$porc, ")"
  )
  # Plot with the percentages in the legend. `linewidth` replaces `size`, which
  # is deprecated for lines since ggplot2 3.4.0 and warned on every call.
  ggplot(data, aes(x = .data[[variable]], color = as.factor(TARGET))) +
    geom_density(linewidth = 1) +
    labs(x = variable, y = "Densidad", title = paste("Distribución de", variable, "según TARGET")) +
    scale_color_manual(
      values = c("blue", "red"),
      labels = etiquetas,
      name = "TARGET"
    ) +
    theme_minimal()
}
# Example usage with the "AMT_CREDIT" variable
graficar_variable(datos, "AMT_CREDIT")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
# Same density plot for the remaining continuous variables. The calls were fused
# on a single line; each one is now its own statement.
graficar_variable(datos, "DAYS_BIRTH")
graficar_variable(datos, "AMT_GOODS_PRICE")
graficar_variable(datos, "DAYS_EMPLOYED")
graficar_variable(datos, "DAYS_LAST_PHONE_CHANGE")
graficar_variable(datos, "AMT_INCOME_TOTAL")
graficar_variable(datos, "AGE")
Guardar base de datos depurada para modelos
Primero eliminamos las variables menos significativas y nos quedamos con las más significativas.
# Columns kept for the modelling stage (the response TARGET is the last one)
variables_significativas <- c(
  "EXT_SOURCE_3", "EXT_SOURCE_2", "DAYS_BIRTH", "AMT_GOODS_PRICE",
  "FLAG_OWN_CAR", "EXT_SOURCE_1", "CODE_GENDER", "NAME_EDUCATION_TYPE",
  "DAYS_EMPLOYED", "REGION_RATING_CLIENT", "AMT_CREDIT", "NAME_INCOME_TYPE",
  "NAME_CONTRACT_TYPE", "AMT_CREDIT_RANGE", "REGION_POPULATION_RELATIVE",
  "NAME_HOUSING_TYPE", "FLAG_WORK_PHONE", "DEF_30_CNT_SOCIAL_CIRCLE",
  "REG_CITY_NOT_LIVE_CITY", "DAYS_REGISTRATION", "REGION_RATING_CLIENT_W_CITY",
  "FLAG_DOCUMENT_3", "AGE_GROUP", "EMPLOYMENT_YEAR", "FLAG_PHONE",
  "OWN_CAR_AGE", "CNT_CHILDREN", "DAYS_LAST_PHONE_CHANGE", "FLAG_DOCUMENT_18",
  "NAME_TYPE_SUITE", "FLAG_DOCUMENT_16", "WEEKDAY_APPR_PROCESS_START",
  "REG_CITY_NOT_WORK_CITY", "AMT_ANNUITY", "WALLSMATERIAL_MODE",
  "AMT_INCOME_TOTAL", "HOUR_APPR_PROCESS_START", "AMT_REQ_CREDIT_BUREAU_QRT",
  "APARTMENTS_AVG", "FLOORSMAX_AVG", "FLAG_DOCUMENT_5", "FLAG_DOCUMENT_2",
  "FONDKAPREMONT_MODE", "OBS_30_CNT_SOCIAL_CIRCLE", "YEARS_EMPLOYED",
  "TARGET"
)
# Keep only those columns
datos <- datos[, variables_significativas]
# NOTE(review): the original comment here said the remaining NAs (caused by
# problematic outliers) are removed, but no statement in this step does so.
# Save the data set to disk so the work can continue without overloading the PC
save(datos, file = "DatosDepurados.RDa")